The aim is to explore the annotations and label transfers for all of the samples in SCPCP000006.
In order to explore the label transfer results, we look into some marker genes, table and percentages of cells in each annotation groups (from label transfers).
# The base path for the OpenScPCA repository, found by its (hidden) .git directory
repository_base <- rprojroot::find_root(rprojroot::is_git_root)
# The current data directory, found within the repository base directory
data_dir <- file.path(repository_base, "data", "current", params$scpca_project_id)
# The path to this module
module_base <- file.path(repository_base, "analyses", "cell-type-wilms-tumor-06")
result_dir <- file.path(module_base, "results")In this notebook, we are working with all of the samples in SCPCP000006.
We work with the pre-processed and labeled Seurat object
that is the output of
02b_label-transfer_fetal_kidney_reference_Stewart.Rmd saved
in the results directory.
The sample metadata can be found in sample_metadata_file
in the data folder.
sample_metadata_file <- file.path(repository_base, "data", "current", params$scpca_project_id, "single_cell_metadata.tsv")
metadata <- read.table(sample_metadata_file, sep = "\t", header = TRUE)To explore the annotation results, we look into some marker genes reported here:
The report will be saved in the notebook directory.
Here we defined function that will be used multiple time all along the notebook.
For a Seurat object objectand a metadata
metadata, the function visualize_metadata will
plot FeaturePlot and BarPlot
object is the Seurat object
metadata the gene or quantitative value to be
plotted
group.by is the metadata used for grouping the
violin plots
visualize_metadata <- function(object, meta, group.by){
if(is.numeric(object@meta.data[,meta])){
d <- SCpubr::do_FeaturePlot(object,
features = meta,
reduction = "umap.rpca",
pt.size = 0.2,
legend.width = 0.5,
legend.length = 5,
legend.position = "right") + ggtitle(meta)
b <- SCpubr::do_ViolinPlot(srat,
features = meta,
ncol = 1,
group.by = group.by,
legend.position = "none")
return(d + b + plot_layout(ncol = 2, widths = c(2,4)))
}
else{
d <- SCpubr::do_DimPlot(object, reduction="umap.rpca", group.by = meta, label = TRUE, repel = TRUE) + ggtitle(paste0(meta," - umap")) + theme(text=element_text(size=18)) + NoLegend()
b <- SCpubr::do_BarPlot(sample = object,
group.by = meta,
split.by = group.by,
position = "fill",
font.size = 10,
legend.ncol = 3) +
ggtitle("% cells")+
xlab(print(group.by)) +
theme(text=element_text(size=18))
return(d + b + plot_layout(ncol = 1, heights = c(4,2)))
}
}For a Seurat object objectand a features
features, the function visualize_feature will
plot FeaturePlot and ViolinPlot
object is the Seurat object
features the gene or quantitative value to be
plotted
group.by is the metadata used for grouping the
violin plots
visualize_feature <- function(object, features, group.by = "seurat_clusters"){
feature_symbol <- AnnotationDbi::select(org.Hs.eg.db,
keys=features,
columns="SYMBOL",
keytype="ENSEMBL")
d <- SCpubr::do_FeaturePlot(object,
features = feature_symbol$ENSEMBL,
reduction = "umap.rpca",
pt.size = 0.2,
legend.width = 0.5,
legend.length = 5,
legend.position = "right") + ggtitle(feature_symbol$SYMBOL)
b <- SCpubr::do_ViolinPlot(srat,
features = feature_symbol$ENSEMBL,
ncol = 1,
group.by = group.by,
legend.position = "none",
assay = "SCT") + ylab(feature_symbol$SYMBOL)
return(d + b + plot_layout(ncol = 2, widths = c(2,4)))
}do_Table_Heatmap shows heatmap of counts of cells for
combinations of two metadata variables
data seurat objectfirst_group is the name of the first metadata to group
the cellslast_group is the name of the second metadata to group
the cellsdo_Table_Heatmap <- function(data, first_group, last_group ){
df <- data@meta.data %>%
mutate_if(sapply(data@meta.data, is.character), as.factor) %>%
group_by( !!sym(first_group), !!sym(last_group))%>%
summarise(Nb = log(n()))
p <- ggplot(df, aes(x= !!sym(first_group), y = !!sym(last_group), fill = Nb)) +
geom_tile() +
scale_fill_viridis_c() +
theme_bw() +
theme(text = element_text(size = 20)) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
guides(fill=guide_legend(title=" Number of \n cells (log)"))
return(p)
}Seurat object# open the processed rds object
filelist <- list.files(result_dir,
recursive = TRUE,
pattern = "02b-fetal_kidney_label-transfer_",
full.names = TRUE)
sample_id <- list.files(result_dir,
recursive = TRUE,
pattern = "02b-fetal_kidney_label-transfer_",
full.names = FALSE)
sample_id <- colsplit(sample_id, pattern = "/", names = c("sample_id", "suffixe"))
sample_id <- sample_id[,1]
sce <- lapply(filelist, function(x) readRDS(x))Seurat objects (with integration)We followed the standard Seurat workflow described here
to merge and integrate the 40 samples from the Wilms tumor dataset
SCPCP000006. For more information, see the vignette Merge
objects (with integration). We have slight changes as we already
performed normalization using SCTransform.
options(future.globals.maxSize= 8912896000000)
srat <- merge(sce[[1]], y = sce[2:length(sce)],
add.cell.ids = sample_id,
project = params$scpca_project_id)
DefaultAssay(object = srat) <- "SCT"
VariableFeatures(srat[["SCT"]]) <- rownames(srat[["SCT"]]@scale.data)
srat <- RunPCA(srat)
srat <- IntegrateLayers(object = srat,
method = RPCAIntegration,
orig.reduction = "pca",
new.reduction = 'rpca',
normalization.method="SCT", verbose = FALSE)
srat <- FindNeighbors(srat, dims = 1:30, reduction = "rpca")
srat <- FindClusters(srat)## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 200338
## Number of edges: 7344017
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8989
## Number of communities: 42
## Elapsed time: 135 seconds
This is not the main point of this notebook, we expect to work on integration later on, one of the final step of the analysis. We simply here would like to have a rough idea of cell/sample distribution.
tmp <- colsplit(colnames(srat), pattern = "_", names = c("sample_id", "barcode"))
srat <- AddMetaData(srat, tmp$sample_id, col.name = "sample_id")
d1 <- SCpubr::do_DimPlot(srat, reduction="umap.rpca", group.by = "sample_id", label = FALSE) + ggtitle("sample_id - umap.rpca reduction")
d2 <- SCpubr::do_DimPlot(srat, reduction="umap.rpca", group.by = "seurat_clusters", label = TRUE, repel = TRUE) + ggtitle("seurat_clusters - umap.rpca reduction")
d1 | d2The use of the right reference is crucial for label transfer and annotation. It is recommended that the cell types in the reference is representative to the cell types to be annotated in the query.
Wilms tumors can contain up to three histologies that resemble fetal kidney: blastema, stroma, and epithelia [1, 2]. Because of their histological similarity to fetal kidneys, Wilms tumors are thought to arise from developmental derangements in embryonic renal progenitors.
For these reasons, we expect Wilms tumor cells to map to fetal cells, especially fetal kidney cells.
Here, we investigate annotations resulting from 4 different strategies:
the _processed.rds objects already contain automated
annotations computed by the Data
Lab:
Annotations determined by SingleR, an automated reference-based method Looney et al. 2019.
Annotations determined by CellAssign, an automated marker-gene based method Zhang et al. 2019.
the label transfers from two fetal references using
Azimuth, that have been performed in the previous
analysis
Using the human fetal atlas as a reference: This is the reference
developed by Cao et
al. provided by Azimuth for label transfer. The
reference contain cells from 15 organs including kidney from fetal
samples. The label transfer have been performed in the notebook
02a_fetal_full_reference_Cao_{sample_id}.Rmd
Using the human fetal kidney atlas: Stewart et
al. created a human fetal
kidney atlas. This reference contains only fetal kidney cells and
has been precisely annotated by kidney experts. The label transfer have
been performed in the notebook
02b_fetal_kidney_reference_Stewart_{sample_id}.Rmd
## [1] "seurat_clusters"
## Warning: ggrepel: 20 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## [1] "seurat_clusters"
## Warning: ggrepel: 20 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
The fetal full reference (Cao et al.) provides three levels of annotations:
fetal_full_predicted.organ is one of the 15 organs
used for building the reference. We were nicely surprised that, for many
of the samples, most of the cells from the Wilms tumor cohort mapped to
fetal kidney cells.
fetal_full_predicted.annotation.l1 holds predicted
cell type annotation
fetal_full_predicted.annotation.l2 holds the
combination of the predicted organ and predicted cell type
annotation
## [1] "seurat_clusters"
Of note, 85.4925176 % of the cells are labeled as kidney cells.
## [1] "seurat_clusters"
## Warning: ggrepel: 30 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## [1] "seurat_clusters"
## Warning: ggrepel: 115 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
The fetal kidney reference (Stewart et al.) provides two levels of annotations:
fetal_kidney_predicted.compartment is one of the 4
main compartments composing a fetal kidney. We expect immune and
endothelial cells to be healthy (non-canerous) cells identified easily
with high confidency. We expect stroma and fetal nephron compartment to
contain both normal and malignant cells.
immune cells
stroma cells
fetal_nephron cells
endothelial cells
fetal_kidney_predicted.cell_type holds predicted
cell type annotations that details further the four compartments, as for
example:
immune subtypes: myeloid cells, lymphocytes,
…
stroma subtypes: mesenchymal stem cells,
fibroblasts, …
fetal_nephron subtypes: podocytes, uteric bud,
mesenchymal cells (that appear to be cap mesenchyme cells) …
endothelial cells
## [1] "seurat_clusters"
tmp <- table(srat$fetal_kidney_predicted.compartment)
percent.of.immune <- 100*tmp["immune"]/sum(tmp)
percent.of.endothelial <- 100*tmp["endothelium"]/sum(tmp)Of note, 0.8660364 % of the cells are labeled as immune cells and 0.8226098 % of the cells are labeled as endothelial cells.
Let’s have a look at the repartition of cells per patient:
##
## endothelium fetal_nephron immune stroma
## SCPCS000168 23 22319 30 104
## SCPCS000169 43 5245 38 1068
## SCPCS000170 8 25999 85 440
## SCPCS000171 1 1908 16 85
## SCPCS000172 12 8004 42 716
## SCPCS000173 5 4524 37 138
## SCPCS000174 8 3270 20 682
## SCPCS000175 243 1468 70 1008
## SCPCS000176 14 1107 25 337
## SCPCS000177 0 1736 2 24
## SCPCS000178 14 490 0 1
## SCPCS000179 25 6196 111 26
## SCPCS000180 5 3751 16 542
## SCPCS000181 5 4060 3 90
## SCPCS000182 4 10933 24 101
## SCPCS000183 61 800 88 2405
## SCPCS000184 39 3249 70 176
## SCPCS000185 66 1327 24 787
## SCPCS000186 3 648 22 22
## SCPCS000187 1 650 9 15
## SCPCS000188 20 3281 68 511
## SCPCS000189 4 3860 9 1
## SCPCS000190 0 1276 1 6
## SCPCS000191 5 454 15 402
## SCPCS000192 0 1498 27 0
## SCPCS000193 383 2429 188 3208
## SCPCS000194 234 7849 83 2646
## SCPCS000195 244 1217 238 1102
## SCPCS000196 0 2340 71 610
## SCPCS000197 4 873 0 3
## SCPCS000198 2 1160 10 0
## SCPCS000199 2 1985 7 6
## SCPCS000200 5 4098 23 261
## SCPCS000201 11 1738 32 544
## SCPCS000202 0 396 18 309
## SCPCS000203 13 2225 7 49
## SCPCS000204 23 3374 47 127
## SCPCS000205 92 9663 76 826
## SCPCS000206 8 7703 48 17
## SCPCS000208 18 12078 35 379
## [1] "seurat_clusters"
## Warning: ggrepel: 10 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
Here we evaluate with marker genes the identification of endothelial, immune cells and other cancer and non-cancerous sub-populations.
markers = list("malignant" = CellType_metadata$ENSEMBL_ID[which(CellType_metadata$cell_class == "malignant")],
"immune"= CellType_metadata$ENSEMBL_ID[which(CellType_metadata$cell_class == "immune")],
"endothelium"= CellType_metadata$ENSEMBL_ID[which(CellType_metadata$cell_class == "endothelium")],
"non-malignant"= CellType_metadata$ENSEMBL_ID[which(CellType_metadata$cell_class == "non-malignant")])
SCpubr::do_DotPlot(sample = srat,
group.by = "fetal_kidney_predicted.compartment",
features = markers,
axis.text.x.angle = 90,
flip = TRUE,
plot.title = "fetal_kidney_predicted.compartment"
) We look at the endothelial marker “ENSG00000110799” = “VWF”
visualize_feature(object = srat,
features = "ENSG00000110799",
group.by = "fetal_kidney_predicted.compartment")We look at the immune marker “ENSG00000081237” = “PTPRC” alias “CD45”
visualize_feature(object = srat,
features = "ENSG00000081237",
group.by = "fetal_kidney_predicted.compartment")
#### “fetal_kidney_predicted.cell_type”
SCpubr::do_DotPlot(sample = srat,
group.by = "fetal_kidney_predicted.cell_type",
features = markers,
axis.text.x.angle = 90,
flip = TRUE,
plot.title = "fetal_kidney_predicted.cell_type"
)
#### “fetal_full_predicted.annotation.l1”
SCpubr::do_DotPlot(sample = srat,
group.by = "fetal_full_predicted.annotation.l1",
features = markers,
axis.text.x.angle = 90,
flip = TRUE,
plot.title = "fetal_full_predicted.annotation.l1"
)
#### “cellassign_celltype_annotation”
SCpubr::do_DotPlot(sample = srat,
group.by = "cellassign_celltype_annotation",
features = markers,
axis.text.x.angle = 90,
flip = TRUE,
plot.title = "cellassign_celltype_annotation"
) Seurat clustersHere we check wether one (or more) Seurat cluster do
correspond to one label. This would allow us to be more confident in the
annotation of immune or endothelial cells for example.
p1 <- do_Table_Heatmap( srat,
last_group = "fetal_kidney_predicted.compartment",
first_group = "seurat_clusters"
)
p2 <- do_Table_Heatmap( srat,
last_group = "fetal_full_predicted.annotation.l1",
first_group = "seurat_clusters"
)
p3 <- do_Table_Heatmap( srat,
last_group = "singler_celltype_annotation",
first_group = "seurat_clusters"
)
p4 <- do_Table_Heatmap( srat,
last_group = "cellassign_celltype_annotation",
first_group = "seurat_clusters"
)
(p1|p2)/(p3|p4)singleR and
cellassign# record the versions of the packages used in this analysis and other environment information
sessionInfo()## R version 4.4.1 (2024-06-14)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 22.04.4 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so; LAPACK version 3.10.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Europe/Vienna
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] org.Hs.eg.db_3.19.1 AnnotationDbi_1.66.0 IRanges_2.38.1
## [4] S4Vectors_0.42.1 Biobase_2.64.0 BiocGenerics_0.50.0
## [7] reshape2_1.4.4 patchwork_1.3.0 lubridate_1.9.3
## [10] forcats_1.0.0 stringr_1.5.1 dplyr_1.1.4
## [13] purrr_1.0.2 readr_2.1.5 tidyr_1.3.1
## [16] tibble_3.2.1 ggplot2_3.5.1 tidyverse_2.0.0
## [19] SCpubr_2.0.2 sctransform_0.4.1 Seurat_5.1.0
## [22] SeuratObject_5.0.2 sp_2.1-4
##
## loaded via a namespace (and not attached):
## [1] RcppAnnoy_0.0.22 splines_4.4.1 later_1.3.2
## [4] ggplotify_0.1.2 polyclip_1.10-7 fastDummies_1.7.4
## [7] lifecycle_1.0.4 rprojroot_2.0.4 globals_0.16.3
## [10] lattice_0.22-6 vroom_1.6.5 MASS_7.3-61
## [13] crosstalk_1.2.1 magrittr_2.0.3 plotly_4.10.4
## [16] sass_0.4.9 rmarkdown_2.28 jquerylib_0.1.4
## [19] yaml_2.3.10 httpuv_1.6.15 spam_2.10-0
## [22] spatstat.sparse_3.1-0 reticulate_1.39.0 cowplot_1.1.3
## [25] pbapply_1.7-2 DBI_1.2.3 RColorBrewer_1.1-3
## [28] abind_1.4-8 zlibbioc_1.50.0 Rtsne_0.17
## [31] yulab.utils_0.1.7 GenomeInfoDbData_1.2.12 ggrepel_0.9.6
## [34] irlba_2.3.5.1 listenv_0.9.1 spatstat.utils_3.1-0
## [37] goftest_1.2-3 RSpectra_0.16-2 spatstat.random_3.3-2
## [40] fitdistrplus_1.2-1 parallelly_1.38.0 leiden_0.4.3.1
## [43] codetools_0.2-20 DT_0.33 tidyselect_1.2.1
## [46] UCSC.utils_1.0.0 farver_2.1.2 viridis_0.6.5
## [49] matrixStats_1.4.1 spatstat.explore_3.3-2 jsonlite_1.8.9
## [52] progressr_0.14.0 ggridges_0.5.6 survival_3.7-0
## [55] tools_4.4.1 ica_1.0-3 Rcpp_1.0.13
## [58] glue_1.7.0 gridExtra_2.3 xfun_0.47
## [61] GenomeInfoDb_1.40.1 withr_3.0.1 fastmap_1.2.0
## [64] fansi_1.0.6 digest_0.6.37 timechange_0.3.0
## [67] R6_2.5.1 mime_0.12 gridGraphics_0.5-1
## [70] colorspace_2.1-1 scattermore_1.2 tensor_1.5
## [73] spatstat.data_3.1-2 RSQLite_2.3.7 utf8_1.2.4
## [76] generics_0.1.3 data.table_1.16.0 httr_1.4.7
## [79] htmlwidgets_1.6.4 uwot_0.2.2 pkgconfig_2.0.3
## [82] gtable_0.3.5 blob_1.2.4 lmtest_0.9-40
## [85] XVector_0.44.0 htmltools_0.5.8.1 dotCall64_1.1-1
## [88] scales_1.3.0 png_0.1-8 spatstat.univar_3.0-1
## [91] knitr_1.48 rstudioapi_0.16.0 tzdb_0.4.0
## [94] nlme_3.1-166 cachem_1.1.0 zoo_1.8-12
## [97] KernSmooth_2.23-24 parallel_4.4.1 miniUI_0.1.1.1
## [100] pillar_1.9.0 grid_4.4.1 vctrs_0.6.5
## [103] RANN_2.6.2 promises_1.3.0 xtable_1.8-4
## [106] cluster_2.1.6 evaluate_1.0.0 cli_3.6.3
## [109] compiler_4.4.1 rlang_1.1.4 crayon_1.5.3
## [112] future.apply_1.11.2 labeling_0.4.3 plyr_1.8.9
## [115] fs_1.6.4 stringi_1.8.4 viridisLite_0.4.2
## [118] deldir_2.0-4 assertthat_0.2.1 munsell_0.5.1
## [121] Biostrings_2.72.1 lazyeval_0.2.2 spatstat.geom_3.3-3
## [124] Matrix_1.7-0 RcppHNSW_0.6.0 hms_1.1.3
## [127] bit64_4.5.2 future_1.34.0 KEGGREST_1.44.1
## [130] shiny_1.9.1 highr_0.11 ROCR_1.0-11
## [133] igraph_2.0.3 memoise_2.0.1 bslib_0.8.0
## [136] bit_4.5.0